#Web scraping using rvest
table <- html_table(page, fill = TRUE)
for (i in 1:length(table)) {
if (!(i %in% c(2, 13, 14))) {
colnames(table[[i]]) <- table[[i]][1, ]
table[[i]] <- table[[i]][-1, ]
}
}
data <- apply(table[[2]], 2, function(x) as.numeric(gsub(",", ".", x, fixed = TRUE)))
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
data =as.data.frame(data [-25,])
data $Differenza= data $Rf-data $xG
media <- mean(data $Differenza)
mediana <- median(data $Differenza)
deviazione_standard <- sd(data $Differenza)
cat("Media:", media, "\n")
## Media: 0.2729167
cat("Mediana:", mediana, "\n")
## Mediana: 0.2
cat("Deviazione Standard:", deviazione_standard, "\n")
## Deviazione Standard: 1.140127
expected goal never lie!! the mean is postive so we confirm the overall overperformance, the standard deviation is not really high so we can say that Naples had almost the trend in all the season, in the study below we will focuse on each period to have a better understanding of the season #Analysis of the performance of the team
#remove italy cup data since it has no data
table[[2]] =as.data.frame(table[[2]] [-25,])
table[[2]]$Rf <- as.numeric(table[[2]]$Rf)
table[[2]]$Data <- as.Date(table[[2]]$Data, format = "%d-%m-%Y")
#graphic between Goal made and expected goal
ggplot(data = table[[2]], aes(x = Data, y = Rf - xG)) +
geom_point(color = "blue", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Fatti - xG") +
ggtitle("Differenza Gol Fatti - xG per ogni Partita") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
geom_smooth(method = "lm", se = FALSE, color = "red") #adding regression line
## `geom_smooth()` using formula = 'y ~ x'
table[[2]]$Rs=as.numeric(table[[2]]$Rs)
#graphic between Goal conceded and expected goal against
ggplot(data = table[[2]], aes(x = Data, y = Rs - xGA)) +
geom_point(color = "red", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Subiti - xGA") +
ggtitle("Differenza Gol Subiti - xGA per ogni Partita") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+
geom_smooth(method = "lm", se = FALSE, color = "red")
## `geom_smooth()` using formula = 'y ~ x'
we can see that Naples has an overall good performance about scoring, it
has a mean more of less of 0.27 so at the end of the season the team
scored more than it has created. It started really well, in fact the
intercept is at 0,6 and slope goes down because they won the
championship in advance so they became more relaxed in game. Let’s
analyze for each time of the season how the time performed ###First half
of the season and the second half of the season, let’s see the
difference
ggplot(data = table[[2]][1:26,], aes(x = Data, y = Rf - xG)) +
geom_point(color = "blue", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Fatti - xG") +
ggtitle("Differenza Gol Fatti - xG per ogni Partita girone andata") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
geom_smooth(method = "lm", se = FALSE, color = "red")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(data = table[[2]][27:nrow(table[[2]]),], aes(x = Data, y = Rf - xG)) +
geom_point(color = "blue", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Fatti - xG") +
ggtitle("Differenza Gol Fatti - xG per ogni Partita girone ritorno") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
geom_smooth(method = "lm", se = FALSE, color = "red")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(data = table[[2]][1:26,], aes(x = Data, y = Rs - xGA)) +
geom_point(color = "red", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Subiti - xGA") +
ggtitle("Differenza Gol Subiti - xGA per ogni Partita girone andata") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+
geom_smooth(method = "lm", se = FALSE, color = "red")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(data = table[[2]][27:nrow(table[[2]]),], aes(x = Data, y = Rs - xGA)) +
geom_point(color = "red", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Subiti - xGA") +
ggtitle("Differenza Gol Subiti - xGA per ogni Partita girone ritorno") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+
geom_smooth(method = "lm", se = FALSE, color = "red")
## `geom_smooth()` using formula = 'y ~ x'
we can see that the first half of the season went pretty well, in fact
the Naples lost only two game(Internazionale and Liverpool) and draw two
game as well(Fiorentina and Lecce at the start of the season). In the
second half of the season the intercept became lower because Napoli lost
the unpredictability, teams became more aware of the scheme and key
players, the slope goes down because as I said before when they won the
Championship they became more relaxed. About the defense, we can see
that at the start of the season the team managed to defend very well, in
fact the difference between goal and XAG is negative, while in the
second half the defensive performance became worse and the really
negative match against AC MILAN influence a lot ###World Cup Analysis,
before and after the
ggplot(data = table[[2]][1:21,], aes(x = Data, y = Rf - xG)) +
geom_point(color = "blue", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Fatti - xG") +
ggtitle("Differenza Gol Fatti - xG per ogni Partita pre mondiale") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
geom_smooth(method = "lm", se = FALSE, color = "red")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(data = table[[2]][22:nrow(table[[2]]),], aes(x = Data, y = Rf - xG)) +
geom_point(color = "blue", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Fatti - xG") +
ggtitle("Differenza Gol Fatti - xG per ogni Partita post mondiale") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
geom_smooth(method = "lm", se = FALSE, color = "red")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(data = table[[2]][1:21,], aes(x = Data, y = Rs - xGA)) +
geom_point(color = "red", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Subiti - xGA") +
ggtitle("Differenza Gol Subiti - xGA per ogni Partita pre mondiale") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+
geom_smooth(method = "lm", se = FALSE, color = "red")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(data = table[[2]][22:nrow(table[[2]]),], aes(x = Data, y = Rs - xGA)) +
geom_point(color = "red", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Subiti - xGA") +
ggtitle("Differenza Gol Subiti - xGA per ogni Partita post mondiale") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+
geom_smooth(method = "lm", se = FALSE, color = "red")
## `geom_smooth()` using formula = 'y ~ x'
While the Serie A was in stand by because the World Cup was playing,
people were saying that Naples, after a really good start will be going
down. The start was not the best, losing with Inter in a game when the
attackers had really big problem to score but they managed to come back
in the following games and to make a bigger lead. The match against
Juventus was the best game of the season but in the next game Naples was
able to win difficult matches with a good overperformance thanks to
Victor Osimhen
###AC MILAN 0-4 , before and after is worth to analyze this time line because in my opion the Championship match with Ac MILAN was the reason for the downfall of the good performance of Naples
# Grafico delle differenze tra gol fatti e xG con retta di regressione
ggplot(data = table[[2]][1:35,], aes(x = Data, y = Rf - xG)) +
geom_point(color = "blue", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Fatti - xG") +
ggtitle("Differenza Gol Fatti - xG per ogni Partita pre Milan 0-4") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
geom_smooth(method = "lm", se = FALSE, color = "red")
## `geom_smooth()` using formula = 'y ~ x'
# Grafico delle differenze tra gol fatti e xG con retta di regressione
ggplot(data = table[[2]][36:nrow(table[[2]]),], aes(x = Data, y =Rf -xG)) +
geom_point(color = "blue", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Fatti - xG") +
ggtitle("Differenza Gol Fatti - xG per ogni Partita post Milan 0-4") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
geom_smooth(method = "lm", se = FALSE, color = "red")
## `geom_smooth()` using formula = 'y ~ x'
# Grafico delle differenze tra gol subiti e xGA per tutte le partite
ggplot(data = table[[2]][1:35,], aes(x = Data, y =Rs -xGA)) +
geom_point(color = "red", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Subiti - xGA") +
ggtitle("Differenza Gol Subiti - xGA per ogni Partita pre Milan 0-4") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+
geom_smooth(method = "lm", se = FALSE, color = "red")
## `geom_smooth()` using formula = 'y ~ x'
# Grafico delle differenze tra gol subiti e xGA per tutte le partite
ggplot(data = table[[2]][36:nrow(table[[2]]),], aes(x = Data, y =Rs -xGA)) +
geom_point(color = "red", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Subiti - xGA") +
ggtitle("Differenza Gol Subiti - xGA per ogni Partita post Milan 0-4") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+
geom_smooth(method = "lm", se = FALSE, color = "red")
## `geom_smooth()` using formula = 'y ~ x'
we can see that the intercept after the Milan match became really low,
and in the next 10 games the Naples managed to overperforme only in two
cases, the same goes for the defensive performance, Napoli conceded a
lot more. The analysis above confirm that the match against Milan
influnced the rest of the season, is not easy to withstand a 0-4 in your
home and than be kicked out of Champions League by the hand of the same
team
#Spectators in home game
partite_casa <- subset(table[[2]], Stadio == "Casa")
partite_casa$Data <- as.Date(partite_casa$Data)
ggplot(data = partite_casa, aes(x = Data, y = Spettatori)) +
geom_point(aes(color = Competizione), size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90)+
scale_color_manual(values = c("Serie A" = "green", "Champions Lg" = "blue")) +
labs(x = "Data", y = "Numero di Spettatori") +
ggtitle("Andamento Numero di Spettatori nelle Partite Stadio Casa") +
theme_minimal()
I wanted to see how many spectators there were in each game, just a
curiosity of mine because at the end of the season it was almost
impossible to find avaible tickets! #Ball Possession
table[[2]]$Data <- as.Date(table[[2]]$Data)
ggplot(data = table[[2]], aes(x = Data, y = Poss., color = Risultato)) +
geom_point(size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90)+
scale_color_manual(values = c("V" = "green", "P" = "red", "N" = "blue")) +
labs(x = "Data", y = "Possesso palla") +
ggtitle("Andamento del Possesso Palla nel Tempo") +
theme_minimal()
One of the main point of the gameplay of our coach Luciano Spalletti is
keeping the ball as much as we can, in fact Naples has the biggest ball
possession in the whole Serie A and it is one of the best in the top 5
league ###Correlation between ball possession and Winning a game Are
this correlated??Let’s check
table[[2]]$Poss. <- as.numeric(table[[2]]$Poss.)
ggplot(data = table[[2]], aes(x = Poss., y = Risultato)) +
geom_point(color = "blue", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Possesso palla", y = "Differenza Gol Fatti - Gol Subiti") +
ggtitle("Correlazione tra Possesso Palla e Risultato") +
theme_minimal()
table[[2]]$Risultato_numerico <- ifelse(table[[2]]$Risultato == "V", 1,
ifelse(table[[2]]$Risultato == "N" || table[[2]]$Risultato == "P", -1, 0))
## Warning in table[[2]]$Risultato == "N" || table[[2]]$Risultato == "P":
## 'length(x) = 48 > 1' nella coercizione in 'logical(1)'
## Warning in table[[2]]$Risultato == "N" || table[[2]]$Risultato == "P":
## 'length(x) = 48 > 1' nella coercizione in 'logical(1)'
correlation <- cor(table[[2]]$Poss., table[[2]]$Risultato_numerico)
correlation
## [1] -0.08156555
The correlation coefficient of -0.08156555 indicates a very weak negative correlation between ball possession and the result obtained. This value indicates that there is no strong linear relationship between these two variables.
In particular, the negative sign indicates that as ball possession increases, the result obtained tends to be slightly more unfavorable (less frequent victories or more frequent defeats). These could be understood watching the game of Naples, usually we had problem with small team that tend to park a bus in their area, Naples team work really well in vertical, instead in small space we can only count on Kvara performance or really fast pass to get through the defense. In game when Napoli can’t score they tend to keep the ball all the time to find some space to attack while the other team defende in 10 men, that’s why when the ball possession gets to high there is a possibility Naples is losing #Formation along the season Naples tested two formation
frequenza_formazioni <- table(table[[2]]$Formazione)
barplot(frequenza_formazioni, main = "Frequenza delle formazioni", xlab = "Formazione", ylab = "Frequenza")
as you can see in the barplot Naples used mostly the 4-3-3,using
Zielinski as a midfielder, instead 4-2-3-1 was used when not all the
main players were avaible, using Zielinski or Raspadori as a TRQ
library(dplyr)
performance_formazione <- table[[2]] %>%
group_by(Formazione) %>%
summarise(media_gol_fatti = mean(Rf),
deviazione_gol_fatti = sd(Rf),
min_gol_fatti = min(Rf),
max_gol_fatti = max(Rf),
media_gol_subiti = mean(Rs),
deviazione_gol_subiti = sd(Rs),
min_gol_subiti = min(Rs),
max_gol_subiti = max(Rs),
media_possesso_palla = mean(Poss.),
deviazione_possesso_palla = sd(Poss.),
min_possesso_palla = min(Poss.),
max_possesso_palla = max(Poss.))
performance_formazione
## # A tibble: 2 × 13
## Formazione media_gol_fatti deviazione_gol_fatti min_gol_fatti max_gol_fatti
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 4-2-3-1 1.67 1.63 0 4
## 2 4-3-3 2.21 1.49 0 6
## # ℹ 8 more variables: media_gol_subiti <dbl>, deviazione_gol_subiti <dbl>,
## # min_gol_subiti <dbl>, max_gol_subiti <dbl>, media_possesso_palla <dbl>,
## # deviazione_possesso_palla <dbl>, min_possesso_palla <dbl>,
## # max_possesso_palla <dbl>
we can see that the performance of the 4-3-3 was better, in fact this was the scheme Spalletti used for the whole season
library(ggplot2)
ggplot(data = performance_formazione, aes(x = Formazione, y = media_gol_fatti)) +
geom_bar(stat = "identity", fill = "blue") +
labs(x = "Formazione", y = "Media Gol Fatti") +
ggtitle("Media Gol Fatti per Formazione")
ggplot(data = performance_formazione, aes(x = Formazione, y = media_gol_subiti)) +
geom_bar(stat = "identity", fill = "red") +
labs(x = "Formazione", y = "Media Gol Subiti") +
ggtitle("Media Gol Subiti per Formazione")
ggplot(data = performance_formazione, aes(x = Formazione, y = media_possesso_palla)) +
geom_bar(stat = "identity", fill = "green") +
labs(x = "Formazione", y = "Media Possesso Palla") +
ggtitle("Media Possesso Palla per Formazione")
#Shot analysis
data <- table[[5]]
playername=data$Giocatore[-(28:29)]
data <- data[, -(1:4)]
data <- data[, -ncol(data)]
data <- apply(data, 2, function(x) as.numeric(gsub(",", ".", x, fixed = TRUE)))
data <- apply(data, 2, function(x) replace(x, is.na(x), 0))
correlation_matrix <- as.data.frame(cor(data))
print(correlation_matrix)
## 90 min Reti Tiri Tiri % TiP Tiri/90
## X90.min 1.00000000 0.4687933 0.49666620 0.48586536 0.04038033 0.14574774
## Reti 0.46879330 1.0000000 0.96870250 0.98488673 0.09219610 0.78589871
## Tiri 0.49666620 0.9687025 1.00000000 0.99501131 0.07367623 0.80458295
## Tiri.1 0.48586536 0.9848867 0.99501131 1.00000000 0.09823111 0.80977925
## X..TiP 0.04038033 0.0921961 0.07367623 0.09823111 1.00000000 0.40611479
## Tiri.90 0.14574774 0.7858987 0.80458295 0.80977925 0.40611479 1.00000000
## TiP.90 -0.06140082 0.4666269 0.46153379 0.47869940 0.72403998 0.81514186
## G.Tiri 0.29812387 0.1899464 0.10745303 0.12964190 0.36022018 0.07312157
## G.TiP 0.32193687 0.1529392 0.09959511 0.09855918 0.11930890 -0.02693894
## Dist. 0.10677676 0.1122464 0.14961482 0.13991755 0.32985611 0.33169840
## Pun. 0.45397137 0.9034364 0.94816472 0.94787623 0.07548446 0.76311636
## Rigori 0.37593121 0.9354891 0.86933119 0.89702840 0.05631689 0.71097613
## Rig.T 0.38764891 0.9480473 0.89150953 0.91369204 0.05196559 0.72971455
## xG 0.48448078 0.9913056 0.98920716 0.99485439 0.08217439 0.80275967
## npxG 0.49227646 0.9867483 0.99189453 0.99529051 0.08536446 0.80408477
## npxG.Sh 0.40547419 0.3209345 0.27794399 0.29639923 0.61963034 0.41808416
## G.xG 0.23068446 0.7158133 0.53989498 0.60665296 0.11492639 0.43681349
## np.G.xG 0.24416665 0.7403772 0.56731993 0.63181151 0.11442556 0.45964655
## TiP/90 G/Tiri G/TiP Dist. Pun. Rigori
## X90.min -0.06140082 0.29812387 0.32193687 0.10677676 0.45397137 0.37593121
## Reti 0.46662686 0.18994644 0.15293924 0.11224643 0.90343636 0.93548908
## Tiri 0.46153379 0.10745303 0.09959511 0.14961482 0.94816472 0.86933119
## Tiri.1 0.47869940 0.12964190 0.09855918 0.13991755 0.94787623 0.89702840
## X..TiP 0.72403998 0.36022018 0.11930890 0.32985611 0.07548446 0.05631689
## Tiri.90 0.81514186 0.07312157 -0.02693894 0.33169840 0.76311636 0.71097613
## TiP.90 1.00000000 0.07310766 -0.08284332 0.15035470 0.43650310 0.41850063
## G.Tiri 0.07310766 1.00000000 0.82715690 0.13162933 0.03130120 0.12064582
## G.TiP -0.08284332 0.82715690 1.00000000 0.17675059 0.02181115 0.10236801
## Dist. 0.15035470 0.13162933 0.17675059 1.00000000 0.20040735 0.13409806
## Pun. 0.43650310 0.03130120 0.02181115 0.20040735 1.00000000 0.83111506
## Rigori 0.41850063 0.12064582 0.10236801 0.13409806 0.83111506 1.00000000
## Rig.T 0.42569926 0.11547589 0.10051635 0.13393697 0.84115765 0.99365254
## xG 0.46929915 0.13690239 0.11550470 0.12938830 0.92164745 0.90970479
## npxG 0.47019898 0.13855559 0.11653100 0.12744225 0.92283038 0.88906188
## npxG.Sh 0.40674112 0.65928106 0.48169642 0.45560115 0.20669641 0.27040881
## G.xG 0.29830526 0.40870665 0.30108427 -0.01580362 0.50836002 0.76330834
## np.G.xG 0.31003813 0.41247035 0.30483179 -0.01584414 0.52683570 0.77657737
## Rig T xG npxG npxG/Sh G-xG np:G-xG
## X90.min 0.38764891 0.48448078 0.49227646 0.4054742 0.23068446 0.24416665
## Reti 0.94804730 0.99130564 0.98674830 0.3209345 0.71581326 0.74037716
## Tiri 0.89150953 0.98920716 0.99189453 0.2779440 0.53989498 0.56731993
## Tiri.1 0.91369204 0.99485439 0.99529051 0.2963992 0.60665296 0.63181151
## X..TiP 0.05196559 0.08217439 0.08536446 0.6196303 0.11492639 0.11442556
## Tiri.90 0.72971455 0.80275967 0.80408477 0.4180842 0.43681349 0.45964655
## TiP.90 0.42569926 0.46929915 0.47019898 0.4067411 0.29830526 0.31003813
## G.Tiri 0.11547589 0.13690239 0.13855559 0.6592811 0.40870665 0.41247035
## G.TiP 0.10051635 0.11550470 0.11653100 0.4816964 0.30108427 0.30483179
## Dist. 0.13393697 0.12938830 0.12744225 0.4556011 -0.01580362 -0.01584414
## Pun. 0.84115765 0.92164745 0.92283038 0.2066964 0.50836002 0.52683570
## Rigori 0.99365254 0.90970479 0.88906188 0.2704088 0.76330834 0.77657737
## Rig.T 1.00000000 0.93047839 0.91146674 0.2733376 0.72811919 0.74924027
## xG 0.93047839 1.00000000 0.99880437 0.3145316 0.61770893 0.64587344
## npxG 0.91146674 0.99880437 1.00000000 0.3168178 0.59681643 0.62557005
## npxG.Sh 0.27333757 0.31453164 0.31681780 1.0000000 0.24890108 0.25603993
## G.xG 0.72811919 0.61770893 0.59681643 0.2489011 1.00000000 0.99734132
## np.G.xG 0.74924027 0.64587344 0.62557005 0.2560399 0.99734132 1.00000000
I checked the correlation between each variable about shooting to see if there is something worth to analyze. Actually the result are really predictable so we don’t need any further analysis, the only thing interesting is that the correlation between the mean distance of shooting and the goal kick taken was lower than i thought. The designed player to take goal kick are Zielinski and Politano that usually shot from a distance or at the end of the penalty area. i think the reason for the low correlation is that sometimes Victor Osimhen take goal kick, with no great results, but since he is the player that shot from the shorter distance he is the main reason of why there is no correlation ###Cluster
independent_vars <- c("Tiri","% TiP", "Tiri/90", "TiP/90", "G/Tiri", "G/TiP", "Dist.", "Pun.", "Rigori", "Rig T", "xG", "npxG", "npxG/Sh", "G-xG", "np:G-xG")
data=as.data.frame(data)
data <- data.frame(Reti = data$Reti, data[, independent_vars])
data <- data[-(28:29),]
data <- apply(data, 2, as.numeric)
standardized_data <- scale(data)
k_values <- 2:10 # Range of k values to evaluate
wss <- vector("numeric", length = length(k_values)) # Vector to store within-cluster sum of squares
for (i in 1:length(k_values)) {
kmeans_result <- kmeans(standardized_data, centers = k_values[i])
wss[i] <- kmeans_result$tot.withinss
}
plot(k_values, wss, type = "b", pch = 19, frame = FALSE, xlab = "Number of Clusters (k)", ylab = "Within-Cluster Sum of Squares (WSS)", main = "Elbow Method")
elbow_index <- which(diff(wss) < mean(diff(wss))) # Find the index where the rate of change drops significantly
optimal_k <- k_values[elbow_index] # Select the corresponding k value
cat("Optimal number of clusters:", optimal_k, "\n")
## Optimal number of clusters: 2 3 5 8
optimal_k <- 3
kmeans_result <- kmeans(standardized_data, centers = optimal_k)
cluster_assignments <- kmeans_result$cluster
data_with_clusters <- bind_cols(data, Cluster = cluster_assignments)
independent_vars <- c("Tiri","X..TiP", "Tiri.90", "TiP.90", "G.Tiri", "G.TiP", "Dist.", "Pun.", "Rigori", "Rig.T", "xG", "npxG", "npxG.Sh", "G.xG", "np.G.xG")
for (var in independent_vars) {
plot_data <- bind_cols(data_with_clusters, Variable = data[, var])
plot_data$Player <- playername
plot <- ggplot(plot_data, aes(x = Reti, y = Variable, color = as.factor(Cluster), label = Player)) +
geom_point(alpha = 0.5) +
geom_text(vjust = -1) + # Adjust the position of the player name labels
labs(title = paste("Clustering Analysis for", var),
x = "Reti",
y = var,
color = "Cluster") +
theme_minimal()
print(plot)
}
the cluster analysis divide our player in 3 group, the first one has the
defend with low partecipation on attacking, the second cluster is really
big and have the majority of the player, instead the third one have our
top player Kvara and Osimhen. I plotted the most important variable
regarding the shooting ###PCA
data_pca <- data[, independent_vars]
pca_result <- PCA(data_pca, graph = FALSE)
fviz_eig(pca_result, addlabels = TRUE)
biplot <- fviz_pca_biplot(pca_result, label = "var", repel = TRUE)
biplot <- biplot + geom_text(aes(label = playername), hjust = 0.5, vjust = -0.5, size = 3, color = "black")
print(biplot)
the pca analysis synthetize all the plot we saw before in only one
graphic, we can see how Zanoli and Gaetano as subbed in try to seize the
opportunity, on the -,- part of the cartesian plan there are the player
not present in goal zone, instead obv we have Osimhen and Kvaratskhelia
with the best stats. Worth to mention Elmas and Politano that had a good
overperformance and Simeone and Raspadori that as you can see have
really good stats as subbed in, from the plot we can see that they know
were to stand, waiting for a goal oppportunity, and when it comes they
don’t miss it, in fact they scored 2/3 goal each worth 3 point, deciding
a match on the last minute
#Passing analysis
i made the same correlation matrix for the passes
data2= table[[6]]
data2 <- data2[, -(2:4)]
data2 <- data2[, -ncol(data2)]
data2 =as.data.frame(data2[-(28:29),])
playername=data2$Giocatore
data2 <- apply(data2, 2, as.numeric)
## Warning in apply(data2, 2, as.numeric): NA introdotti per coercizione
## Warning in apply(data2, 2, as.numeric): NA introdotti per coercizione
## Warning in apply(data2, 2, as.numeric): NA introdotti per coercizione
## Warning in apply(data2, 2, as.numeric): NA introdotti per coercizione
## Warning in apply(data2, 2, as.numeric): NA introdotti per coercizione
data2 <- apply(data2, 2, function(x) replace(x, is.na(x), 0))
data2=as.data.frame(data2)
completion_rate <- data2$Compl. / data2$`Tent,`
hist(completion_rate, breaks = 10, col = "lightblue", main = "Completion Rate Distribution")
# Player Performance Analysis
player_performance <- data.frame(Player = playername, CompletionRate = completion_rate)
# Top 5 players with highest completion rates
top_players <- player_performance[order(player_performance$CompletionRate, decreasing = TRUE), ][1:5, ]
print(top_players)
## Player CompletionRate
## 22 Diego Demme 0.9476440
## 4 Stanislav Lobotka 0.9400261
## 8 Amir Rrahmani 0.9123212
## 3 Kim Min-jae 0.9075391
## 15 Juan Jesus 0.9016173
i put a lot of interest in the completation rate of passing, because in our team is a main factor and that’s the reason Lobotka is one of the key players of the team
player_performance <- data.frame(Player = playername, CompletionRate = completion_rate,dist_tot=data2$`Dist. Tot.`, completions=data2$Compl., XAG=data2$xAG)
boxplot(CompletionRate ~ Player, data = player_performance,
main = "Completion Rate by Player", xlab = "Player", ylab = "Completion Rate",
las = 2)
unique_players <- unique(player_performance$Player)
player_positions <- 1:length(unique_players)
text(player_positions, par("usr")[3] - 0.1, labels = unique_players, srt = 90, adj = c(1, 0.5), xpd = TRUE)
i used a boxplot to plot all the information in one graphic
###Cluster
independent_vars <- c("PrgP", "Dist. Tot.", "Dist. Prog.","xAG","PF","PPA","Cross in area")
data <- data.frame(CompletionRate = completion_rate, data2[, independent_vars])
data <- data[-(28:29),]
data <- apply(data, 2, as.numeric)
standardized_data <- scale(data)
k_values <- 2:10
wss <- vector("numeric", length = length(k_values))
for (i in 1:length(k_values)) {
kmeans_result <- kmeans(standardized_data, centers = k_values[i])
wss[i] <- kmeans_result$tot.withinss
}
plot(k_values, wss, type = "b", pch = 19, frame = FALSE, xlab = "Number of Clusters (k)", ylab = "Within-Cluster Sum of Squares (WSS)", main = "Elbow Method")
elbow_index <- which(diff(wss) < mean(diff(wss))) # Find the index where the rate of
optimal_k <- k_values[elbow_index]
cat("Optimal number of clusters:", optimal_k, "\n")
## Optimal number of clusters: 2 3 4
optimal_k <- 3
kmeans_result <- kmeans(standardized_data, centers = optimal_k)
cluster_assignments <- kmeans_result$cluster
data_with_clusters <- bind_cols(data, Cluster = cluster_assignments)
independent_vars <- c("PrgP", "Dist..Tot.", "Dist..Prog.","xAG","PF","PPA","Cross.in.area")
for (var in independent_vars) {
plot_data <- bind_cols(data_with_clusters, Variable = data[, var])
plot_data$Player <- playername
plot <- ggplot(plot_data, aes(x = CompletionRate, y = Variable, color = as.factor(Cluster), label = Player)) +
geom_point(alpha = 0.5) +
geom_text(vjust = -1) +
labs(title = paste("Clustering Analysis for", var),
x = "Completion Rate",
y = var,
color = "Cluster") +
theme_minimal()
print(plot)
}
this cluster give us a clear differentation: the firsr cluster is about
player who pass a lot to set up the game rather than to assist, for
instance DC, CMD and Oliveira that as a fullback is the one that is more
focused on defend. The second cluster is about the player that don’t
pass a lot, let’s say they are more incline to receive rather than to
pass or they are not part of the main players. The last cluster is about
fullback, wings and TRQ: so player that cross a lot and give assist to
our forward I plotted the most important variable regarding the passing
###Passing Distribution
data_contrasts <- data2[, c("Compl.", "Compl.", "Compl.")]
data_contrasts$Giocatore <- playername
data_contrasts_long <- reshape2::melt(data_contrasts, id.vars = "Giocatore")
variable <- c("pass brevi", "pass medi", "pass lunghi")
contrasts_plot <- ggplot(data_contrasts_long, aes(x = Giocatore, y = value, fill = variable)) +
geom_bar(stat = "identity") +
labs(title = "Passing Distribution",
x = "Players",
y = "Value") +
scale_fill_manual(values = c("darkgreen", "darkblue", "darkorange"),
labels = c("pass brevi", "pass medi", "pass lunghi")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
axis.title.y = element_blank())
print(contrasts_plot)
just to visualize what type of pass each player mostly uses
###PCA
data_pca <- data[, independent_vars]
pca_result <- PCA(data_pca, graph = FALSE)
fviz_eig(pca_result, addlabels = TRUE)
biplot <- fviz_pca_biplot(pca_result, label = "var", repel = TRUE)
biplot <- biplot + geom_text(aes(label = playername), hjust = 0.5, vjust = -0.5, size = 3, color = "black")
print(biplot)
In the PCA analysis the differentation is more clear, on the +,+ side we
have the playmakers of the team, with the majority of passes and
progressiv passes, instead in the lower quadrant we have the player that
create the majority of goal chance, that include pass inside the penalty
area the pass who lead to a shot ###Type of pass
data=table[[7]]
data <- apply(data, 2, as.numeric)
## Warning in apply(data, 2, as.numeric): NA introdotti per coercizione
## Warning in apply(data, 2, as.numeric): NA introdotti per coercizione
## Warning in apply(data, 2, as.numeric): NA introdotti per coercizione
## Warning in apply(data, 2, as.numeric): NA introdotti per coercizione
data=as.data.frame(data[-(28:29),])
data=data[,-(2:4)]
data=data[,-(18)]
variables <- c( "Pun.", "PassFil", "Scambi", "Cross", "Rimesse in gioco", "Angoli", "Conv.", "Div.", "Dir.", "Compl.", "Fuorigioco", "Blocchi")
data$Giocatore=playername
for (variable in variables) {
variable_data <- subset(data, select = c("Giocatore", variable))
pass_data_long <- reshape2::melt(variable_data, id.vars = "Giocatore")
pass_plot <- ggplot(pass_data_long, aes(x = Giocatore, y = value)) +
geom_bar(stat = "identity", width = 0.5) +
labs(title = paste("Type of pass =", variable),
x = "Players",
y = "Pass Value") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) # Rotate x-axis labels if needed
print(pass_plot)
}
i just plotted the stats for type of passes for each player for a better
visualizzation
correlation_matrix <- as.data.frame(cor(data[,-(1)]))
print(correlation_matrix)
## 90 min Tent, In gioco Non in gioco Pun.
## 90 min 1.0000000 0.8647042 0.8546715 0.5412497 0.6664147
## Tent, 0.8647042 1.0000000 0.9952086 0.5660018 0.8119927
## In gioco 0.8546715 0.9952086 1.0000000 0.4827005 0.7708628
## Non in gioco 0.5412497 0.5660018 0.4827005 1.0000000 0.7755663
## Pun. 0.6664147 0.8119927 0.7708628 0.7755663 1.0000000
## PassFil 0.4091665 0.3185861 0.3098044 0.2353219 0.1287241
## Scambi 0.5977919 0.7889197 0.7730088 0.5457114 0.8175156
## Cross 0.3982986 0.3719994 0.3257459 0.5769347 0.3933497
## Rimesse in gioco 0.3214153 0.4636284 0.3932978 0.8322983 0.5330990
## Angoli 0.1874191 0.1889013 0.1579607 0.3562359 0.3178728
## Conv. 0.1549125 0.1529393 0.1226886 0.3306280 0.2906833
## Div. 0.1775407 0.2104151 0.1780834 0.3811375 0.3667245
## Dir. 0.1532424 0.1443407 0.1294540 0.2017404 0.2010421
## Compl. 0.8526916 0.9975471 0.9967668 0.5314481 0.8038232
## Fuorigioco 0.5077411 0.6559175 0.6127612 0.6981589 0.6093722
## Blocchi 0.6337811 0.6073953 0.5697289 0.6304422 0.4325600
## PassFil Scambi Cross Rimesse in gioco Angoli
## 90 min 0.40916654 0.59779190 0.3982986 0.32141528 0.1874191
## Tent, 0.31858615 0.78891970 0.3719994 0.46362842 0.1889013
## In gioco 0.30980436 0.77300876 0.3257459 0.39329783 0.1579607
## Non in gioco 0.23532189 0.54571138 0.5769347 0.83229831 0.3562359
## Pun. 0.12872408 0.81751559 0.3933497 0.53309904 0.3178728
## PassFil 1.00000000 0.33160312 0.5093173 0.26398390 0.2700727
## Scambi 0.33160312 1.00000000 0.4066475 0.51189766 0.1807330
## Cross 0.50931731 0.40664755 1.0000000 0.50422531 0.7625191
## Rimesse in gioco 0.26398390 0.51189766 0.5042253 1.00000000 0.1005320
## Angoli 0.27007269 0.18073295 0.7625191 0.10053205 1.0000000
## Conv. 0.25859564 0.18372473 0.7693957 0.08858179 0.9760317
## Div. 0.15184755 0.17742109 0.6792728 0.12156573 0.9555835
## Dir. 0.04733369 -0.02816788 0.4640606 -0.06918469 0.8334693
## Compl. 0.28335298 0.77770124 0.3133713 0.42714237 0.1526506
## Fuorigioco 0.47746525 0.68740297 0.7591123 0.75462377 0.3997177
## Blocchi 0.47915254 0.49930914 0.7608618 0.72282991 0.3365709
## Conv. Div. Dir. Compl. Fuorigioco
## 90 min 0.15491249 0.1775407 0.15324236 0.8526916 0.5077411
## Tent, 0.15293927 0.2104151 0.14434068 0.9975471 0.6559175
## In gioco 0.12268857 0.1780834 0.12945397 0.9967668 0.6127612
## Non in gioco 0.33062803 0.3811375 0.20174044 0.5314481 0.6981589
## Pun. 0.29068327 0.3667245 0.20104215 0.8038232 0.6093722
## PassFil 0.25859564 0.1518476 0.04733369 0.2833530 0.4774652
## Scambi 0.18372473 0.1774211 -0.02816788 0.7777012 0.6874030
## Cross 0.76939574 0.6792728 0.46406056 0.3133713 0.7591123
## Rimesse in gioco 0.08858179 0.1215657 -0.06918469 0.4271424 0.7546238
## Angoli 0.97603168 0.9555835 0.83346935 0.1526506 0.3997177
## Conv. 1.00000000 0.8789291 0.74779742 0.1146656 0.4083650
## Div. 0.87892907 1.0000000 0.85559981 0.1807921 0.3470736
## Dir. 0.74779742 0.8555998 1.00000000 0.1298695 0.1394948
## Compl. 0.11466560 0.1807921 0.12986951 1.0000000 0.6163767
## Fuorigioco 0.40836505 0.3470736 0.13949481 0.6163767 1.0000000
## Blocchi 0.33982457 0.2734403 0.18101410 0.5588698 0.7889576
## Blocchi
## 90 min 0.6337811
## Tent, 0.6073953
## In gioco 0.5697289
## Non in gioco 0.6304422
## Pun. 0.4325600
## PassFil 0.4791525
## Scambi 0.4993091
## Cross 0.7608618
## Rimesse in gioco 0.7228299
## Angoli 0.3365709
## Conv. 0.3398246
## Div. 0.2734403
## Dir. 0.1810141
## Compl. 0.5588698
## Fuorigioco 0.7889576
## Blocchi 1.0000000
seeing the corr matrix i didn’t find anything worth to analyze #Creation of goal or shot opportunities
variables <- c("SCA", "SCA90", "PassaggiInGioco", "PassaggiNonInGioco", "A", "Tiri", "Falli", "Def.", "GCA", "GCA90")
data=table[[8]]
data <- apply(data, 2, function(x) as.numeric(gsub(",", ".", x, fixed = TRUE)))
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
data=as.data.frame(data[-(28:29),])
data=data[,-(1:4)]
data=data[,-(22)]
data_pca <- data[, variables]
pca_result <- PCA(data_pca, graph = FALSE)
fviz_eig(pca_result, addlabels = TRUE)
biplot <- fviz_pca_biplot(pca_result, label = "var", repel = TRUE)
biplot <- biplot + geom_text(aes(label = playername), hjust = 0.5, vjust = -0.5, size = 3, color = "black")
print(biplot)
The Scree plot shows the variance explained by each principal component,
allowing you to determine how many principal components to consider The
biplot displays the observations (players) and inplane variables of the
first two principal components, allowing to identify potential patterns
or clusters. We can see how kvaratskelia is the key player of the team,
the one that create most goal or shot opportunities #Defensive
action
variables <- c("Cntrs", "Cntrs.1", "Blocchi", "Int", "Salvat.")
data=table[[9]]
data <- apply(data, 2, function(x) as.numeric(gsub(",", ".", x, fixed = TRUE)))
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
data=as.data.frame(data[-(28:29),])
data=data[,-(1:4)]
data=data[,-(22)]
data_pca <- data[, variables]
pca_result <- PCA(data_pca, graph = FALSE)
fviz_eig(pca_result, addlabels = TRUE)
biplot <- fviz_pca_biplot(pca_result, label = "var", repel = TRUE)
biplot <- biplot + geom_text(aes(label = playername), hjust = 0.5, vjust = -0.5, size = 3, color = "black")
print(biplot)
from this plot we can see how our rearguard works, we have the CDM that
have the job to intercept any loose ball and to stop the other team
dribblers. From our fullback we can see how Di Lorenzo is more likely to
incercept the ball to go for a counterattack, instead as a TS even if
Mario Rui played more, Oliveira has better defensive stats(they are not
for 90 min) that means that means that the defensive contribution of
Mario is really poor. Worth to mention Kvaraskheila that recovered a lot
of ball in the last quarter of the field and Lozano: even if his scoring
contribution was insufficient this year he made a lot of dirty work,
sometimes acting as a fullback when Di Lorenzo was attacking and doing a
great job in intercepting ball, for instance we score against Frankufurt
and Cremonese thanks to the ball he recovered when doing high intensity
pressing and lead us to score important goals ###Distribution of were
contrast happen
data_contrasts <- data[, c("Treq. dif.", "Treq. cen.", "Treq. off.")]
data_contrasts$Giocatore <- playername
data_contrasts_long <- reshape2::melt(data_contrasts, id.vars = "Giocatore")
contrasts_plot <- ggplot(data_contrasts_long, aes(x = Giocatore, y = value, fill = variable)) +
geom_bar(stat = "identity") +
labs(title = "Contrasts Distribution",
x = "Players",
y = "Value") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
axis.title.y = element_blank())
print(contrasts_plot)
#Ball Controll
data=table[[10]]
data <- apply(data, 2, function(x) as.numeric(gsub(",", ".", x, fixed = TRUE)))
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
## Warning in FUN(newX[, i], ...): NA introdotti per coercizione
data=as.data.frame(data[-(28:29),])
data=data[,-(1:4)]
data=data[,-(25)]
variables <- c("Tocchi", "Att", "Tkld", "Controlli palla", "Dist. Tot.","Dist. Prog.","Ric.")
data_pca <- data[, variables]
pca_result <- PCA(data_pca, graph = FALSE)
fviz_eig(pca_result, addlabels = TRUE)
biplot <- fviz_pca_biplot(pca_result, label = "var", repel = TRUE)
biplot <- biplot + geom_text(aes(label = playername), hjust = 0.5, vjust = -0.5, size = 3, color = "black")
print(biplot)
the ball controll analysis using PCA give us a full understanding of the
study using only 2 dimensions. Kvara is by far the best about taking
possession of the ball. on the +,- quadrant we find the player who are
at the center of Napoli’s ball possession, instead Anguissa exceed in
both so he is put in the middle ###Distribution of touch according to
the area of the field
data_contrasts <- data[, c("Area dif.", "Treq. dif.", "Treq. cen.","Treq. off.","Area off.")]
data_contrasts$Giocatore <- playername
data_contrasts_long <- reshape2::melt(data_contrasts, id.vars = "Giocatore")
contrasts_plot <- ggplot(data_contrasts_long, aes(x = Giocatore, y = value, fill = variable)) +
geom_bar(stat = "identity") +
labs(title = "Contrasts Distribution",
x = "Players",
y = "Value") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
axis.title.y = element_blank())
print(contrasts_plot)
#Player with most PAP(point per match) with at least 10 games as a
starter
data=table[[11]]
data=data[,1:16]
data=as.data.frame(data[-(17:32),])
data$Tit <- as.numeric(data$Tit)
data_filtered <- data %>%
filter(Tit >= 10)
data_sorted <- data_filtered %>%
arrange(desc(PAP))
top_5_players <- data_sorted %>%
top_n(5, PAP) %>%
select(c(1, 10, 16))
print(top_5_players)
## Giocatore Tit PAP
## 1 Mário Rui 21 2,77
## 2 Alex Meret 34 2,44
## 3 Giovanni Di Lorenzo 36 2,41
## 4 Hirving Lozano 20 2,41
## 5 Kim Min-jae 35 2,40
Mario rui is our lucky charm #Relationship between Fouls and Yellow Card
data=table[[12]]
data=data[,-10]
data=as.data.frame(data[-(28:29),])
data$Falli=as.numeric(data$Falli)
data$Amm.=as.numeric(data$Amm.)
scatter_plot <- ggplot(data, aes(x = Falli, y = Amm., label = Giocatore)) +
geom_point() +
geom_text(hjust = -0.2, vjust = -0.2) +
geom_smooth(method = "lm", se = FALSE, color = "blue") + # Add regression line
labs(title = "Relationship between Fouls and Yellow Cards",
x = "Fouls",
y = "Yellow Cards") +
theme_minimal()
print(scatter_plot)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: label
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
correlation <- cor(data$Falli, data$Amm.)
# Print the correlation coefficient
print(correlation)
## [1] 0.7502612